import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# define GPU for BERT model
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import pandas as pd
from matplotlib import pyplot as plt
import wordcloud
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
import spacy
from gensim import corpora
from gensim.models import LsiModel, LdaModel
from gensim.models.callbacks import PerplexityMetric
from tqdm.notebook import tqdm
import pyLDAvis.gensim
import gensim.downloader as api
from gensim.models import Word2Vec
import gensim
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from bokeh.io import output_notebook
import bokeh.models as bm, bokeh.plotting as pl
from nltk.tokenize import WordPunctTokenizer
from multiprocessing import Pool
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import umap
import hdbscan
from sklearn.cluster import DBSCAN
# from spacy.cli.download import download
# download(model="en_core_web_sm")
# nltk.download('stopwords')
# Load the Enron e-mail dump: one row per message, with the mailbox file
# path and the raw RFC-2822 message text.
df = pd.read_csv('emails.csv')
df.head(3)
# Peek at one raw message to understand its structure.
msg = df['message'][0]
print(msg)
import email
# Bare expression: notebook cell output showing the parsed header keys.
'Email object keys: %s' % email.message_from_string(msg).keys()
def get_text_from_email(msg_text):
    """Extract and concatenate all text/plain payloads of a raw message."""
    parsed = email.message_from_string(msg_text)
    chunks = []
    for part in parsed.walk():
        if part.get_content_type() == 'text/plain':
            chunks.append(part.get_payload())
    return ''.join(chunks)
get_text_from_email(msg)
# Extract the plain-text body and the main headers into dataframe columns.
df['Content'] = list(map(get_text_from_email, df['message']))
messages = list(map(email.message_from_string, df['message']))
for key in ['Date', 'From', 'To', 'Subject']:
    df[key] = [doc[key] for doc in messages]
def split_email_addresses(line):
    """Split a comma-separated address header into a frozenset of addresses.

    Returns None when the header value is empty or missing.
    """
    if not line:
        return None
    return frozenset(addr.strip() for addr in line.split(','))
def convert_date(data):
    """Parse an e-mail Date header, normalise it to naive UTC, and format it.

    Format codes: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
    """
    stamp = pd.to_datetime(data)
    return stamp.tz_convert(None).strftime('%a %d %b %Y %H:%M:%S')
# Derive the mailbox owner from the file path ('allen-p/...' -> 'allen-p').
df['User'] = df['file'].map(lambda x: x.split('/')[0])
df.drop(['file', 'message'], axis=1, inplace=True)
df['From'] = df['From'].map(split_email_addresses)
df['To'] = df['To'].map(split_email_addresses)
# Dates round-trip through strings: normalise every value to naive-UTC text,
# then re-parse the column into datetime64.
# NOTE(review): infer_datetime_format is deprecated in recent pandas.
df['Date'] = df['Date'].apply(convert_date)
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df.sample(3)
df['Content'].iloc[48]
def good_word(word):
    """Return True when the word contains no ASCII digit."""
    return not any('0' <= ch <= '9' for ch in word)
def clean_text(text):
    """Lower-case an e-mail, strip noisy tokens (words containing digits,
    markup, addresses, links), and keep only the text after 'subject:'.

    Fixes over the original: raw strings for every regex (the plain strings
    used invalid escape sequences such as '\\s' and '\\['), the unused local
    ``x`` is removed, and the per-call nltk RegexpTokenizer is replaced by
    the equivalent ``re.split`` on whitespace with empty tokens dropped.
    """
    tokens = [tok for tok in re.split(r'\s+', text.lower()) if tok]
    text = ' '.join(word for word in tokens if good_word(word))
    # NOTE: '<.*>' and '\[.*\]' are greedy, so each wipes everything between
    # the first and last bracket — matches the original behaviour.
    text = re.sub(r'<.*>', '', text)            # text inside <>, some forwarded info
    text = re.sub(r'\[.*\]', '', text)          # text inside [], some attachments
    text = re.sub(r'\S*@\S*\s?', '', text)      # remove email addresses
    text = re.sub(r'\S*/hou/\S*s?', '', text)   # some corporation usernames
    text = re.sub(r'http[s]?://\S+', '', text)  # remove links
    text = re.sub(r'www\.\S+', '', text)        # remove links
    # Keep only what follows the first 'subject:' marker, if any.
    marker = 'subject:'
    idx = text.find(marker)
    if idx != -1:
        text = text[idx + len(marker):].strip()
    return text
# Clean both the body and the subject line.
# NOTE(review): the loop variable 'email' shadows the 'email' module
# imported above; harmless here, but easy to trip over later.
for field in ['Content', 'Subject']:
    df[field] = [clean_text(email) for email in df[field]]
df['Content'].iloc[48]
# Distribution of e-mail lengths in words, clipped at 500.
sentence_lengths = [min(500, len(s.split())) for s in df['Content']]
cnt = 0
for x in sentence_lengths:
    if x == 500:
        cnt += 1
print('There are %s sentences with length >= 500' % cnt)
plt.figure(figsize=(11, 5))
plt.hist(sentence_lengths, bins=30, color='green')
plt.xlabel('Количество слов в письмах', fontsize=18)  # "words per e-mail"
plt.ylabel('Количество писем', fontsize=18)  # "number of e-mails"
plt.xlim(0, 400)
plt.show()
# E-mail volume by year, by weekday, and by hour of day.
# NOTE(review): without plt.show()/new figures between calls these three
# pandas plots may land on the same axes when run as a plain script; in a
# notebook each cell renders separately.
ax = df.groupby(df['Date'].dt.year)['Content'].count().plot(xlim=(1995, 2005), grid=True)
ax.set_xlabel('Year', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
ax = df.groupby(df['Date'].dt.dayofweek)['Content'].count().plot(grid=True)
ax.set_xlabel('Day of week', fontsize=15)
ax.set_ylabel('N emails', fontsize=15)
ax = df.groupby(df['Date'].dt.hour)['Content'].count().plot(grid=True)
ax.set_xlabel('Hour', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
def plot_wordcloud(text):
    """Render a word cloud of *text*, filtering sklearn's English stop words."""
    figure, axis = plt.subplots(figsize=(16, 10))
    cloud = wordcloud.WordCloud(
        width=1200,
        height=750,
        max_words=200,
        stopwords=list(ENGLISH_STOP_WORDS),
        background_color='white',
    )
    axis.imshow(cloud.generate(text))
    axis.axis("off")
    # print(wc.words_)
# Word clouds: all subject lines, and the bodies of a 5000-e-mail sample.
subjects = ' '.join(df['Subject'])
plot_wordcloud(subjects)
contents = ' '.join(df.sample(5000)['Content'])
plot_wordcloud(contents)
# Top-20 most frequent senders.
cnt = defaultdict(int)
for item in df['From']:
    sender = list(item)[0]  # 'From' is a frozenset; take its single element
    # sender = sender[:sender.rfind('.')]
    cnt[sender] += 1
users, counts = zip(*sorted(cnt.items(), key=lambda x: x[1], reverse=True)[:20])
plt.figure(figsize=(11, 5))
plt.barh(range(20), counts, align='center', color='green', alpha=0.8)
plt.yticks(range(20), users, fontsize=15)
plt.show()
# Top-20 most frequent receivers.
# BUG FIX: 'cnt' previously carried over the sender counts from the cell
# above, so this chart actually ranked senders+receivers combined; reset
# the counter before counting receivers.
cnt = defaultdict(int)
bad_cnt = 0
for item in df['To']:
    if item is None:  # e-mail with no 'To' header
        bad_cnt += 1
        continue
    for receiver in item:
        # receiver = x[:x.rfind('.')]
        cnt[receiver] += 1
print('There is %s emails without receivers' % bad_cnt)
users, counts = zip(*sorted(cnt.items(), key=lambda x: x[1], reverse=True)[:20])
plt.figure(figsize=(11, 5))
plt.barh(range(20), counts, align='center', color='green', alpha=0.8)
plt.yticks(range(20), users, fontsize=15)
plt.show()
# Build a sender -> receiver edge list with per-pair e-mail counts.
sub_df = df[['From', 'To', 'Date']].dropna()
print(sub_df.shape)
# drop emails sent to multiple addresses
sub_df = sub_df.loc[sub_df['To'].map(len) == 1]
print(sub_df.shape)
sub_df = sub_df.groupby(['From', 'To']).count().reset_index()
# unpack the single-element frozensets into plain address strings
sub_df['From'] = sub_df['From'].map(lambda x: next(iter(x)))
sub_df['To'] = sub_df['To'].map(lambda x: next(iter(x)))
# the 'Date' count is really the number of e-mails per (From, To) pair
sub_df.rename(columns={'Date': 'count'}, inplace=True)
sub_df.sort_values('count', ascending=False).head(10)
import networkx as nx
# Directed communication graph; edge weight = number of e-mails.
G = nx.from_pandas_edgelist(sub_df, 'From', 'To', edge_attr='count', create_using=nx.DiGraph())
print('Number of nodes: %s, Number of edges: %s' % (G.number_of_nodes(), G.number_of_edges()))
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 5))
# Weighted in-/out-degree distributions on a log-scaled count axis.
ax1.hist(list(dict(G.in_degree(weight='count')).values()), log=True, bins=20, color='green')
ax1.set_xlabel('Степень входа вершин', fontsize=18)  # "vertex in-degree"
ax2.hist(list(dict(G.out_degree(weight='count')).values()), log=True, bins=20, color='green')
ax2.set_xlabel('Степень выхода вершин', fontsize=18)  # "vertex out-degree"
# Connected-component size distribution of the undirected graph.
comp_sizes = []
for nodes in nx.connected_components(G.to_undirected()):
    comp_sizes.append(len(nodes))
print('There are %s connected components' % len(comp_sizes))
comp_sizes = np.array(comp_sizes)
# Count how many components exist at each size.
cnt = defaultdict(int)
for x in comp_sizes:
    cnt[x] += 1
keys = list(cnt.keys())
values = list(cnt.values())
# Sort sizes by how often they occur, most common first.
order = np.argsort(values)[::-1]
keys = np.array(keys)[order]
values = np.array(values)[order]
plt.figure(figsize=(11, 5))
plt.barh(range(len(keys)), values, color='green', log=True, )
plt.yticks(range(len(keys)), keys, fontsize=15)
plt.ylabel('Размер компоненты связности', fontsize=18)  # "component size"
plt.xlabel('Количество', fontsize=18)  # "count"
plt.show()
# Keep only the first 50k e-mails to make the NLP part tractable.
df = df.head(50000)
def normalize(word):
    """Lemmatise a single word using the module-level spaCy pipeline `nlp`."""
    token = nlp(word)[0]
    return token.lemma_
nlp = spacy.load("en_core_web_sm")
# Notebook sanity check of the lemmatiser.
normalize('dogs'), normalize('playing')
emails = list(df['Content'])
# Tokenise each body into lower-cased word tokens (\w+ runs only).
emails = [nltk.RegexpTokenizer(r'\w+').tokenize(email.lower()) for email in emails]
def only_eng_chars(word):
    """Return True when every character is a lowercase ASCII letter a-z."""
    return all('a' <= ch <= 'z' for ch in word)
def f(email_words):
    """Lemmatise one tokenised e-mail (worker for Pool.map): drop stop words,
    tokens shorter than 3 chars, and tokens with non-a-z characters."""
    kept = (w for w in email_words
            if w not in stop_words and len(w) > 2 and only_eng_chars(w))
    return [normalize(w) for w in kept]
stop_words = set(nltk.corpus.stopwords.words('english'))
# NOTE(review): hard-coded worker count — assumes a ~70-core machine.
n_cpu = 70
with Pool(n_cpu) as p:
    emails = p.map(f, emails)
# Drop e-mails that ended up with fewer than 3 usable tokens.
emails = [email_words for email_words in emails if len(email_words) > 2]
print(f'Total emails extracted: {len(emails)}')
# Gensim bag-of-words corpus over the cleaned tokens.
dictionary = corpora.Dictionary(emails)
corpus = [dictionary.doc2bow(email_words) for email_words in emails]
def f(num_topics):
    """Train an LDA model with `num_topics` topics and return its perplexity.

    Worker for Pool.map below; NOTE(review): this redefines the earlier `f`.
    """
    perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics, callbacks=[perplexity_logger],
                   iterations=200, passes=10)
    # NOTE(review): gensim's PerplexityMetric.get_value normally expects the
    # model as a keyword argument — confirm this bare call returns the metric.
    return perplexity_logger.get_value()
# Sweep topic counts in parallel (one process per candidate) and plot
# perplexity as a function of the number of topics.
rng = range(3, 10)
n_cpu = len(rng)
with Pool(n_cpu) as p:
    pps = p.map(f, rng)
plt.figure(figsize=(9, 6))
plt.plot(rng, pps)
plt.xlabel('Num topics')
plt.ylabel('Perplexity')
plt.title('Perplexity dependent by number of topics')
plt.grid()
plt.show()
def show(model):
    """Print each topic of a gensim model as a comma-separated word list.

    print_topics() yields (topic_id, '0.029*"word" + ...') pairs; only the
    bare words are kept.
    """
    for _, topic_repr in model.print_topics():
        terms = topic_repr.split("+")
        # each term looks like '0.029*"word"'; strip weight and quotes
        bare_words = [term.split("*")[1].strip()[1:-1] for term in terms]
        print(', '.join(bare_words))
# Retrain LDA at the perplexity-minimising topic count and visualise it.
best_num_topics = rng[np.argmin(pps)]
lda = LdaModel(corpus, id2word=dictionary, num_topics=best_num_topics, iterations=200, passes=10)
show(lda)
# NOTE(review): newer pyLDAvis versions renamed this module to
# pyLDAvis.gensim_models.
data_lda = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(data_lda)
# Warm-start Word2Vec from pretrained GloVe Twitter vectors, then fine-tune
# on our corpus for 5 epochs.
# NOTE(review): this is the gensim 3.x API — in gensim 4 'size' became
# 'vector_size', 'model.vocab' became 'key_to_index', and
# 'intersect_word2vec_format' moved/was removed; pin gensim<4 or port.
path = api.load("glove-twitter-50", return_path=True)
model = KeyedVectors.load_word2vec_format(path)
model_2 = Word2Vec(size=50, min_count=1)
model_2.build_vocab(emails)
total_examples = model_2.corpus_count
# Extend the vocabulary with the pretrained words, then copy their vectors in.
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format(path)
model_2.train(emails, total_examples=total_examples, epochs=5)
# Collect the vocabulary of the cleaned corpus and look up an embedding for
# every word; words missing from the model get a zero vector.
words = np.array(list(set((" ".join([" ".join(x) for x in emails])).split())))
word_vectors = []
bad_count = 0
for x in words:
    try:
        vec = model_2.wv.get_vector(x)
    except KeyError:
        bad_count += 1
        vec = np.zeros(50)
    word_vectors.append(vec)
print('There are %s words from %s without embeddings' % (bad_count, len(words)))
# Reduce the 50-d embeddings to 5-d with UMAP for clustering.
word_vectors = umap.UMAP(n_components=5).fit_transform(word_vectors)
# word -> 5-d vector lookup used to build sentence vectors below.
word2vec_map = dict()
for i, word in enumerate(words):
    word2vec_map[word] = word_vectors[i]
# Represent each e-mail as the mean of its words' 5-d vectors.
sentence_vectors = []
for email in emails:
    vec = np.zeros(5)
    for word in email:
        # no KeyError possible: every vocabulary word got a vector above
        vec += word2vec_map[word]
    vec /= len(email)
    sentence_vectors.append(vec)
# Density-based clustering of the e-mail vectors.
hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=7, core_dist_n_jobs=70)
hdbscan_.fit(sentence_vectors)
labels = sorted(set(hdbscan_.labels_))
print('There are %s clusters' % len(labels))
# Compute a centroid per cluster; the first label (-1) is the HDBSCAN
# outlier bucket and is skipped.
cluster_vectors = []
sentence_vectors = np.array(sentence_vectors)
for label in labels[1:]:
    vec = np.mean(sentence_vectors[hdbscan_.labels_ == label], axis=0)
    cluster_vectors.append(vec)
cluster_vectors = np.array(cluster_vectors)
cluster_vectors.shape, word_vectors.shape
def show_cluster(num, cluster_size=10):
    """Print the `cluster_size` vocabulary words closest (squared Euclidean
    distance) to the centroid of cluster `num`."""
    sq_dists = ((word_vectors - cluster_vectors[num]) ** 2).sum(axis=1)
    nearest = np.argsort(sq_dists)[:cluster_size]
    print(words[nearest])
# Inspect a few hand-picked clusters: 3, 54, 110, 194
show_cluster(110)
show_cluster(54)
show_cluster(3)
show_cluster(194)
# Project word embeddings to 2-d for interactive plotting (alternative
# TSNE/PCA projections kept below for reference), then standardise.
word_vectors_umap = umap.UMAP(n_components=2).fit_transform(word_vectors)
# word_vectors_pca = TSNE(n_components=2, n_jobs=40).fit_transform(word_vectors)
# word_vectors_pca = PCA(n_components=2).fit_transform(word_vectors)
word_vectors_umap = StandardScaler().fit_transform(word_vectors_umap)
output_notebook()
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive bokeh scatter plot; every extra keyword argument
    becomes a hover tooltip column."""
    colors = [color] * len(x) if isinstance(color, str) else color
    source = bm.ColumnDataSource({'x': x, 'y': y, 'color': colors, **kwargs})
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=source)
    hover = bm.HoverTool(tooltips=[(name, "@" + name) for name in kwargs.keys()])
    fig.add_tools(hover)
    if show:
        pl.show(fig)
    return fig
draw_vectors(word_vectors_umap[:, 0], word_vectors_umap[:, 1], token=words)
# Quick embedding sanity checks: nearest neighbours and analogy arithmetic.
model_2.wv.most_similar('russia')
model_2.wv.most_similar(positive=['queen', 'boy'], negative=['girl'])
from bertopic import BERTopic
emails_bert = list(df['Content'])
len(emails_bert)
def tokenize(x):
    """Split *x* with the module-level WordPunctTokenizer and re-join with spaces."""
    tokens = nltk_tokenizer.tokenize(x)
    return ' '.join(tokens)
nltk_tokenizer = WordPunctTokenizer()
emails_bert = [tokenize(email) for email in emails_bert]
# Fit BERTopic on the (only lightly tokenised) e-mail texts.
topic_model = BERTopic()
topics, _ = topic_model.fit_transform(emails_bert)
def show_topic(num):
    """Return the words of BERTopic topic `num` whose score is positive."""
    pairs = topic_model.get_topic(num)
    return [word for word, score in pairs if score > 0]
# Inspect a few hand-picked BERTopic topics, then the overall topic map.
show_topic(366)
show_topic(261)
show_topic(327)
show_topic(1164)
show_topic(394)
topic_model.visualize_topics()